0. PACKAGE LOAD AND MARKDOWN CONFIGURATION

# Load modeling, visualization, and parallelization packages.
# (Duplicate library() calls for future, doParallel, and nnet removed.)
library(caret)        # model training and data partitioning
library(future)       # availableCores(); re-exports makeClusterPSOCK()
library(doParallel)   # parallel backend for caret
library(heatmaply)    # interactive correlation heatmap
library(factoextra)   # PCA visualization helpers (get_eigenvalue, fviz_eig)
library(FactoMineR)   # PCA()
library(nnet)         # 'multinom' method backend for caret
library(e1071)        # required by caret for several model utilities

1. DATA DOWNLOAD

The data will be downloaded using the link from Coursera page.

# Download the training and test CSVs from the Coursera-hosted links.
# (URLs restored: the extraction had stripped the quotes and the scheme.)
fileLinkTraining <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"

fileLinkTest <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"

harTraining <- read.csv(fileLinkTraining)

harTest <- read.csv(fileLinkTest)

2. DATA WRANGLING

The first columns from both datasets (user name, time stamps) will be removed, since the model shall be independent of the person who uses it and the data is not a time series. The test set has several empty columns and "pure NA" columns; these will be removed from both sets. Columns with no variation in the test set will also be removed.

# TRUE if the column contains at least one non-NA value.
not_all_na <- function(x) any(!is.na(x))

# Drop identifier/timestamp columns (the model must not depend on the user
# or on time), then drop all-NA columns and zero-variance columns, using
# the TEST set as the reference for which columns survive.
# (Quoted column names restored: the extraction had stripped the strings.)
harTstClean <- harTest %>%
  select(-c("X",
            "user_name",
            "raw_timestamp_part_1",
            "raw_timestamp_part_2",
            "cvtd_timestamp",
            "problem_id",
            "num_window")) %>%
  select_if(not_all_na) %>%
  select_if(~ n_distinct(.) > 1)

# Keep the same surviving columns in the training set, plus the outcome.
harTrnClean <- harTraining[, c(names(harTstClean), "classe")]

The training dataset will be split to allow model testing and validation, at a proportion of 60/20/20%.

# Stratified 60/20/20 split into train/test/validation on classe.
# (Fixed: use <- for assignment and FALSE instead of the reassignable F.)
inVal <- createDataPartition(harTrnClean$classe, p = 0.2, list = FALSE)

val <- harTrnClean[inVal, ]

model <- harTrnClean[-inVal, ]

# 0.6/0.8 of the remaining 80% yields 60% of the original data for training.
inTrain <- createDataPartition(model$classe, p = (0.6 / 0.8), list = FALSE)

train <- model[inTrain, ]

test <- model[-inTrain, ]

3. EDA

3.1 Null model performance

Pre-model tasks are related to evaluating the Null Model predictions. This will be accomplished by using the most frequent class as every "prediction", generating a lower limit for any model that will be created, as suggested by Zumel and Mount (2014).

# Null model baseline: predict the most frequent class for every row,
# giving a lower bound any real model must beat.
nullPred <- test["classe"]

# Most frequent class in the test split.
majorityClass <- names(which.max(table(nullPred$classe)))
nullPred$pred.class <- majorityClass

print(confusionMatrix(as.factor(nullPred$pred.class),
                      reference = as.factor(nullPred$classe)))
Confusion Matrix and Statistics

          Reference
Prediction    A    B    C    D    E
         A 1116  759  684  643  721
         B    0    0    0    0    0
         C    0    0    0    0    0
         D    0    0    0    0    0
         E    0    0    0    0    0

Overall Statistics
                                          
               Accuracy : 0.2845          
                 95% CI : (0.2704, 0.2989)
    No Information Rate : 0.2845          
    P-Value [Acc > NIR] : 0.506           
                                          
                  Kappa : 0               
                                          
 Mcnemar's Test P-Value : NA              

Statistics by Class:

                     Class: A Class: B Class: C Class: D Class: E
Sensitivity            1.0000   0.0000   0.0000   0.0000   0.0000
Specificity            0.0000   1.0000   1.0000   1.0000   1.0000
Pos Pred Value         0.2845      NaN      NaN      NaN      NaN
Neg Pred Value            NaN   0.8065   0.8256   0.8361   0.8162
Prevalence             0.2845   0.1935   0.1744   0.1639   0.1838
Detection Rate         0.2845   0.0000   0.0000   0.0000   0.0000
Detection Prevalence   1.0000   0.0000   0.0000   0.0000   0.0000
Balanced Accuracy      0.5000   0.5000   0.5000   0.5000   0.5000

3.2 Covariates correlation

Multicollinearity (covariate correlation) will be investigated, since it can be harmful for some kinds of models, like logistic regression. To investigate it, heatmaply_cor (from package heatmaply) will be used, so correlated covariates will also be grouped together using a hierarchical clustering technique.

# Correlation matrix of the covariates; integer columns are coerced to
# numeric so cor() works on a purely numeric matrix.
# (Superseded mutate_if() replaced by mutate(across(where(...), ...)).)
corrMat <- harTrnClean %>%
  select(-classe) %>%
  mutate(across(where(is.integer), as.numeric)) %>%
  cor()

# Clustered interactive heatmap; near-zero label sizes because there are
# ~52 covariates and the labels would otherwise overlap.
heatmaply(corrMat, symm = TRUE, cexRow = .0001, cexCol = .0001, branches_lwd = .1)

From the plot, it is possible to see that very few covariates present correlation.

# PCA on the standardized covariates (outcome column excluded).
pcaCov <- harTrnClean %>% select(-classe) %>% PCA(scale.unit = TRUE, graph = FALSE)

# Eigenvalue / explained-variance table for each principal component.
get_eigenvalue(pcaCov)
        eigenvalue variance.percent cumulative.variance.percent
Dim.1  8.356480510      16.07015483                    16.07015
Dim.2  8.103311777      15.58329188                    31.65345
Dim.3  4.676019495       8.99234518                    40.64579
Dim.4  4.129637592       7.94161075                    48.58740
Dim.5  3.651958340       7.02299681                    55.61040
Dim.6  3.003559604       5.77607616                    61.38648
Dim.7  2.239960734       4.30761680                    65.69409
Dim.8  2.072819572       3.98619149                    69.68028
Dim.9  1.717230735       3.30236680                    72.98265
Dim.10 1.508821495       2.90157980                    75.88423
Dim.11 1.385497930       2.66441910                    78.54865
Dim.12 1.129241536       2.17161834                    80.72027
Dim.13 0.986674562       1.89745108                    82.61772
Dim.14 0.890702735       1.71288987                    84.33061
Dim.15 0.836058641       1.60780508                    85.93841
Dim.16 0.789251336       1.51779103                    87.45620
Dim.17 0.677935082       1.30372131                    88.75993
Dim.18 0.609720195       1.17253884                    89.93247
Dim.19 0.532431274       1.02390630                    90.95637
Dim.20 0.484840952       0.93238645                    91.88876
Dim.21 0.425640834       0.81854007                    92.70730
Dim.22 0.398595212       0.76652925                    93.47383
Dim.23 0.382694691       0.73595133                    94.20978
Dim.24 0.339300889       0.65250171                    94.86228
Dim.25 0.307706521       0.59174331                    95.45402
Dim.26 0.292964236       0.56339276                    96.01742
Dim.27 0.255991614       0.49229157                    96.50971
Dim.28 0.236252841       0.45433239                    96.96404
Dim.29 0.203445985       0.39124228                    97.35528
Dim.30 0.179889105       0.34594059                    97.70122
Dim.31 0.170113804       0.32714193                    98.02837
Dim.32 0.131742348       0.25335067                    98.28172
Dim.33 0.121832106       0.23429251                    98.51601
Dim.34 0.112447021       0.21624427                    98.73225
Dim.35 0.091981456       0.17688742                    98.90914
Dim.36 0.079718822       0.15330543                    99.06245
Dim.37 0.064211387       0.12348344                    99.18593
Dim.38 0.056537299       0.10872558                    99.29465
Dim.39 0.055188020       0.10613081                    99.40079
Dim.40 0.040801837       0.07846507                    99.47925
Dim.41 0.038103474       0.07327591                    99.55253
Dim.42 0.035457709       0.06818790                    99.62071
Dim.43 0.033727650       0.06486087                    99.68557
Dim.44 0.032215407       0.06195270                    99.74753
Dim.45 0.028716975       0.05522495                    99.80275
Dim.46 0.026853460       0.05164127                    99.85439
Dim.47 0.021661899       0.04165750                    99.89605
Dim.48 0.020595887       0.03960747                    99.93566
Dim.49 0.013471638       0.02590700                    99.96157
Dim.50 0.011875304       0.02283712                    99.98440
Dim.51 0.005961537       0.01146449                    99.99587
Dim.52 0.002148931       0.00413256                   100.00000
# Scree plot of the principal components' explained variance.
fviz_eig(pcaCov)

Although the correlation between covariates is not large, a principal component analysis shows that with only 19 components (out of 52) it is possible to explain about 90% of the total variation (see the cumulative variance table above; the first 10 components explain roughly 76%).

4. MODELING

All modeling will be done using parallel computation via the doParallel package.

# Detect the number of available cores and spin up a PSOCK cluster.
# NOTE(review): makeClusterPSOCK() is provided by the parallelly package
# (re-exported by future) — confirm library(future) exposes it here.
workers <- availableCores()

cl <- makeClusterPSOCK(workers)

4.1 Multinomial Regression

The first model to be tested is multinomial regression. Two models will be built: with and without principal components as a pre-processing step.

# Register the PSOCK cluster as the foreach backend used by caret::train().
registerDoParallel(cl)
Warning messages:
1: In .Internal(parent.frame(n)) :
  encerrando conexão não utilizada 6 (<-localhost:11340)
2: In .Internal(parent.frame(n)) :
  encerrando conexão não utilizada 5 (<-localhost:11340)
3: In .Internal(parent.frame(n)) :
  encerrando conexão não utilizada 4 (<-localhost:11340)
4: In .Internal(parent.frame(n)) :
  encerrando conexão não utilizada 3 (<-localhost:11340)
# Multinomial regression (nnet) fitted on the 60% training split only.
# (Fixed: the original trained on the full harTrnClean, leaking the
# test and validation rows into model fitting.)
mdlLrOr <- train(classe ~ ., data = train, method = 'multinom')
# weights:  270 (212 variable)
initial  value 31580.390718 
iter  10 value 26968.798785
iter  20 value 23633.429794
iter  30 value 21168.462012
iter  40 value 19698.814293
iter  50 value 18847.381722
iter  60 value 18162.031080
iter  70 value 17859.829680
iter  80 value 17577.328793
iter  90 value 17420.724013
iter 100 value 17295.100054
final  value 17295.100054 
stopped after 100 iterations
# Release the parallel workers.
stopCluster(cl)

5. FINAL MODEL EVALUATION

LS0tCnRpdGxlOiAiSEFSIENvdXJzZXJhIEpvaG7CtHMgSG9wa2lucyBQcmF0aWNhbCBNTCIKYXV0aG9yOiAiU2FtdWVsIEJvenppIEJhY28iCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KIyMgMC4gUEFDS0FHRSBMT0FEIEFORCBNQVJLRE9XTiBDT05GSUdVUkFUSU9OCgpgYGB7ciBQQUNLQUdFIExPQUQsIGVjaG89VCwgbWVzc2FnZT1GLCB3YXJuaW5nPUZ9CmxpYnJhcnkoY2FyZXQpCmxpYnJhcnkoZnV0dXJlKQpsaWJyYXJ5KGRvUGFyYWxsZWwpCmxpYnJhcnkoaGVhdG1hcGx5KQpsaWJyYXJ5KGZhY3RvZXh0cmEpCmxpYnJhcnkoRmFjdG9NaW5lUikKbGlicmFyeShubmV0KQpsaWJyYXJ5KGZ1dHVyZSkKbGlicmFyeShkb1BhcmFsbGVsKQpsaWJyYXJ5KGUxMDcxKQpsaWJyYXJ5KG5uZXQpCmBgYAojIyAxLiBEQVRBIERPV05MT0FECgpUaGUgZGF0YSB3aWxsIGJlIGRvd25sb2FkZWQgdXNpbmcgdGhlIGxpbmsgZnJvbSBDb3Vyc2VyYSBwYWdlLgpgYGB7ciBEQVRBIERPV05MT0FELCBjYWNoZSA9IFRSVUV9CmZpbGVMaW5rVHJhaW5pbmcgPC0gImh0dHBzOi8vZDM5NnF1c3phNDBvcmMuY2xvdWRmcm9udC5uZXQvcHJlZG1hY2hsZWFybi9wbWwtdHJhaW5pbmcuY3N2IiAKCmZpbGVMaW5rVGVzdCA8LSAiaHR0cHM6Ly9kMzk2cXVzemE0MG9yYy5jbG91ZGZyb250Lm5ldC9wcmVkbWFjaGxlYXJuL3BtbC10ZXN0aW5nLmNzdiIKCmhhclRyYWluaW5nIDwtIHJlYWQuY3N2KGZpbGVMaW5rVHJhaW5pbmcpCgpoYXJUZXN0IDwtIHJlYWQuY3N2KGZpbGVMaW5rVGVzdCkKYGBgCiMjIDIuIERBVEEgV1JBTkdMSU5HCgpGaXJzdCBjb2x1bW5zIGZyb20gYm90aCBkYXRhc2V0ICh1c2VyIG5hbWUsIHRpbWUgc3RhbXBzKSB3aWxsIGJlIHJlbW92ZSBzaW5jZSB0aGUgbW9kZWwgc2hhbGwgYmUgaW5kZXBlbmRlbnQgZnJvbSB0aGUgcGVyc29uIHdobyB1c2VzIGl0IGFuZCBpdCBpcyBub3QgYSB0aW1lIHNlcmllcy4gRGF0YSBvbiB0ZXN0IHNldCBoYXMgc2V2ZXJhbCBlbXB0eSBjb2x1bW5zIGFuZCDigJxwdXJlIE5BwrRz4oCdIGNvbHVtbnMuIFRoaXMgb25lcyB3aWxsIGJlIHJlbW92ZWQgZnJvbSBib3RoIHRyYWluaW5nIHNldC4gQ29sdW1ucyB3aXRoIG5vIHZhcmlhdGlvbiBhdCB0ZXN0IHNldCB3aWxsIGFsc28gYmUgcmVtb3ZlZC4KCmBgYHtyIENPTFVNTiBTRUxFQ1RJT059Cm5vdF9hbGxfbmEgPC0gZnVuY3Rpb24oeCkgYW55KCFpcy5uYSh4KSkgIyBmdW5jdGlvbiB0byBkZXRlcm1pbmUgaWYgdGhlIGNvbHVtbiBoYXMgYWxsIHZhbHVlcyBsaWtlIE5BwrRzCgpoYXJUc3RDbGVhbiA8LQogICAgICAgIGhhclRlc3QgJT4lCiAgICAgICAgc2VsZWN0KC1jKCJYIiwgCiAgICAgICAgICAgICAgICAgICJ1c2VyX25hbWUiLCAKICAgICAgICAgICAgICAgICAgInJhd190aW1lc3RhbXBfcGFydF8xIiwgCiAgICAgICAgICAgICAgICAgICJyYXdfdGltZXN0YW1wX3BhcnRfMiIsIAogICAgICAgICAgICAgICAgICAiY3Z0ZF90aW1lc3RhbXAiLAogICAgICAg
ICAgICAgICAgICAicHJvYmxlbV9pZCIsCiAgICAgICAgICAgICAgICAgICJudW1fd2luZG93IikpICU+JQogICAgICAgIHNlbGVjdF9pZihub3RfYWxsX25hKSAlPiUKICAgICAgICBzZWxlY3RfaWYofm5fZGlzdGluY3QoLikgPiAxKQoKaGFyVHJuQ2xlYW4gPC0gaGFyVHJhaW5pbmdbICxjKG5hbWVzKGhhclRzdENsZWFuKSwgImNsYXNzZSIpXQpgYGAKVGhlIHRyYWluaW5nIGRhdGFzZXQgd2lsbCBiZSBzcGxpdCB0byBhbGxvdyBtb2RlbCB0ZXN0IGFuZCB2YWxpZGF0aW9uIGF0IGEgcHJvcG9ydGlvbiB0byA2MC8yMC8yMCAlLiAKYGBge3IgREFUQVNFVCBTUExJVH0KaW5WYWwgPSBjcmVhdGVEYXRhUGFydGl0aW9uKGhhclRybkNsZWFuJGNsYXNzZSwgcCA9IDAuMiwgbGlzdCA9IEYpCgp2YWwgPC0gaGFyVHJuQ2xlYW5baW5WYWwsIF0KCm1vZGVsIDwtIGhhclRybkNsZWFuWy1pblZhbCwgXQoKaW5UcmFpbiA8LSBjcmVhdGVEYXRhUGFydGl0aW9uKG1vZGVsJGNsYXNzZSwgcCA9ICgwLjYvMC44KSwgbGlzdCA9IEYpCgp0cmFpbiA8LSBtb2RlbFtpblRyYWluLCBdCgp0ZXN0IDwtIG1vZGVsWy1pblRyYWluLCBdCmBgYAojIyAzLiBFREEKCiMjIyAzLjEgTnVsbCBtb2RlbCBwZXJmb3JtYW5jZQoKUHJlLW1vZGVsIHRhc2tzIGFyZSByZWxhdGVkIGV2YWx1YXRlIHRoZSBOdWxsIE1vZGVsIHByZWRpY3Rpb25zLiBUaGlzIHdpbGwgYmUgYWNjb21wbGlzaGVkIGNvbnNpZGVyaW5nIHRoZSBtb3N0IGZyZXF1ZW50IGNsYXNzIGluIGFsbCDigJxwcmVkaWN0aW9uc+KAnSwgZ2VuZXJhdGluZyBhIGxvd2VyIGxpbWl0IGZvciBhbnkgbW9kZWwgdGhhdCB3aWxsIGJlIGNyZWF0ZWQsIGFzIHN1Z2dlc3QgYnkgWnVtZWwgYW5kIE1vdW50ICgyMDE0KS4gCmBgYHtyIE5VTEwgTU9ERUwgUEVSRk9STUFOQ0UsIG1lc3NhZ2U9Riwgd2FybmluZz1GfQpudWxsUHJlZCA8LSB0ZXN0ICU+JSBzZWxlY3QoImNsYXNzZSIpCgpudWxsUHJlZCRwcmVkLmNsYXNzIDwtIG5hbWVzKHNvcnQodGFibGUobnVsbFByZWQkY2xhc3NlKSwgZGVjcmVhc2luZyA9IFRSVUUpWzFdKQoKcHJpbnQoY29uZnVzaW9uTWF0cml4KGFzLmZhY3RvcihudWxsUHJlZCRwcmVkLmNsYXNzKSwgcmVmZXJlbmNlID0gYXMuZmFjdG9yKG51bGxQcmVkJGNsYXNzZSkpKQpgYGAKCiMgMy4yIENvdmFyaWF0ZXMgY29ycmVsYXRpb24KClRoZSBtdWx0aWNvbGluZWFyaXR5IChjb3ZhcmlhdGVzIGNvcnJlbGF0aW9uKSB3aWxsIGJlIGludmVzdGlnYXRlZCwgc2luY2UgaXQgY2FuIGJlIGhhcm1mdWwgZm9yIHNvbWUga2luZCBvZiBtb2RlbHMsIGxpa2UgbG9naXN0aWMgcmVncmVzc2lvbi4gVG8gaW52ZXN0aWdhdGUsIGEgdGhlIGhlYXRtYXBseV9jb3IgKGZyb20gcGFja2FnZSBoZWF0bWFwbHkpIHdpbGwgYmUgdXNlciBzbyByZWxhdGVkIGNvdmFyaWF0ZXMgd2lsbCBhbHNvIGJlIGdyb3VwZWQgdG9nZXRoZXIgdXNpbmcgYSBoaWVyYXJjaGljYWwgY2x1c3RlciB0ZWNobmlxdWUuIAoKYGBge3IgTVVM
VElDT0xJTkVBUklUWX0KY29yck1hdCA8LSBoYXJUcm5DbGVhbiAlPiUgc2VsZWN0KC1jbGFzc2UpICU+JSBtdXRhdGVfaWYoaXMuaW50ZWdlciwgYXMubnVtZXJpYykgJT4lIGNvcigpCgpoZWF0bWFwbHkoY29yck1hdCwgc3ltbSA9IFRSVUUsIGNleFJvdyA9IC4wMDAxLCBjZXhDb2wgPSAuMDAwMSwgYnJhbmNoZXNfbHdkID0gLjEpCmBgYAoKRm9yIHRoZSBwbG90LCBpdCBpcyBwb3NzaWJsZSB0byBzZWUgdGhhdCB2ZXJ5IGZldyBjb3ZhcmlhdGVzIHByZXNlbnRzIGNvcnJlbGF0aW9uLiAKCmBgYHtyIFBDQSBDT1ZBUklBVEVTfQpwY2FDb3YgPC0gaGFyVHJuQ2xlYW4gJT4lIHNlbGVjdCgtY2xhc3NlKSAlPiUgUENBKHNjYWxlLnVuaXQgPSBUUlVFLCBncmFwaCA9IEZBTFNFKQoKZ2V0X2VpZ2VudmFsdWUocGNhQ292KQoKZnZpel9laWcocGNhQ292KQpgYGAKQWx0aG91Z2ggdGhlIGNvcnJlbGF0aW9uIGJldHdlZW4gY292YXJpYXRlcyBpcyBub3QgYmlnLCBhIHByaW5jaXBhbCBjb21wb25lbnRzIGFuYWx5c2lzIHNob3cgdGhhdCB3aXRoIG9ubHkgMTAgY29tcG9uZW50cyAoZnJvbSA1MikgaXQgaXMgcG9zc2libGUgdG8gZXhwbGFpbiA5MCUgb2YgdGhlIHRvdGFsIHZhcmlhdGlvbi4KCiMjIDQuIE1PREVMSU5HCgpBbGwgbW9kZWxpbmcgd2lsbCBiZSBkb25lIGNvbnNpZGVyaW5nIGEgcGFyYWxsZWwgY29tcHV0YXRpb24gdXNpbmcgKipkb1BhcmFsbGVsKiogcGFja2FnZS4KYGBge3IgUEFSQUxFTExJWkFUSU9OfQp3b3JrZXJzIDwtIGF2YWlsYWJsZUNvcmVzKCkKCmNsIDwtIG1ha2VDbHVzdGVyUFNPQ0sod29ya2VycykKYGBgCgoKIyMjIDQuMSBNdWx0aW5vbWlhbCBSZWdyZXNzaW9uCgpUaGUgZmlyc3QgbW9kZWwgdGhhdCB3aWxsIGJlIHRlc3RlZCBpdCBpcyBtdWx0aW5vbWlhbCByZWdyZXNzaW9uLiBUd28gbW9kZWxzIHdpbGwgYmUgZG9uZTogd2l0aCBhbmQgd2l0aG91dCBwcmluY2lwYWwgY29tcG9uZW50IGFzIGEgcHJlLXByb2Nlc3NpbmcuCgpgYGB7ciBNVUxUSU5PTUlBTCBSRUdSRVNTSU9OLCB3YXJuaW5nPUYsIG1lc3NhZ2U9Rn0KcmVnaXN0ZXJEb1BhcmFsbGVsKGNsKQoKbWRsTHJPciA8LSB0cmFpbihjbGFzc2Ugfi4sIGRhdGEgPSBoYXJUcm5DbGVhbiwgbWV0aG9kID0gJ211bHRpbm9tJykKCnN0b3BDbHVzdGVyKGNsKQpgYGAKCiAgICAgICAgCiMjIDUuIEZJTkFMIE1PREVMIEVWQUxVQVRJT04=